library(data.table)
library(lfe)
library(ggplot2)
library(stringr)


analyzeBK <- function() {
  d.in <- fread('../data/black_knight_to_analyze.csv') # Black knight data
  d.in[,conf.pct := OriginalLoanAmount / conforming_limit]
  d.in[,conf.bucket := cut(conf.pct,breaks = c(0,seq(.5+.01,1.5+.01,by = 0.05)),labels = c(seq(.5,1.5,by = 0.05)))]
  d.in <- d.in[!is.na(OriginalLTV) & !is.na(OriginalCreditScore) & !is.na(OriginalInterestRate)]
  d.in[,conf.bucket := relevel(conf.bucket,ref = "1")]
  
  r.full <- rateRegression(d.in)
  r.2008 <- rateRegression(d.in[origination.year == 2008])
  r.2012 <- rateRegression(d.in[origination.year == 2012])
  r.2016 <- rateRegression(d.in[origination.year == 2016])
  
  ggplot(r.full[b >= .6 & b <= 1.4]) + geom_point(aes(x=b,y=c)) + geom_ribbon(aes(x=b,ymax = c + 2*s,ymin = c-2*s),alpha=.2) + theme_bw() + coord_cartesian(ylim = c(-.25,.5)) +
    geom_vline(xintercept = 1,linetype = 'dashed') + geom_hline(yintercept = 0,linetype = 'dashed') + xlab(NULL) + ylab(NULL) + scale_y_continuous(labels = function(x) {paste0(x,'%')})
  ggsave('../output/spread_full.png',height=3,width=5,units = 'in')
  ggplot(r.2008[b >= .6 & b <= 1.4]) + geom_point(aes(x=b,y=c)) + geom_ribbon(aes(x=b,ymax = c + 2*s,ymin = c-2*s),alpha=.2) + theme_bw() + coord_cartesian(ylim = c(-.25,.5)) +
    geom_vline(xintercept = 1,linetype = 'dashed') + geom_hline(yintercept = 0,linetype = 'dashed') + xlab(NULL) + ylab(NULL) + scale_y_continuous(labels = function(x) {paste0(x,'%')})
  ggsave('../output/spread_2008.png',height=3,width=5,units = 'in')
  ggplot(r.2012[b >= .6 & b <= 1.4]) + geom_point(aes(x=b,y=c)) + geom_ribbon(aes(x=b,ymax = c + 2*s,ymin = c-2*s),alpha=.2) + theme_bw() + coord_cartesian(ylim = c(-.25,.5)) +
    geom_vline(xintercept = 1,linetype = 'dashed') + geom_hline(yintercept = 0,linetype = 'dashed') + xlab(NULL) + ylab(NULL) + scale_y_continuous(labels = function(x) {paste0(x,'%')})
  ggsave('../output/spread_2012.png',height=3,width=5,units = 'in')
  ggplot(r.2016[b >= .6 & b <= 1.4]) + geom_point(aes(x=b,y=c)) + geom_ribbon(aes(x=b,ymax = c + 2*s,ymin = c-2*s),alpha=.2) + theme_bw() + coord_cartesian(ylim = c(-.25,.5)) +
    geom_vline(xintercept = 1,linetype = 'dashed') + geom_hline(yintercept = 0,linetype = 'dashed') + xlab(NULL) + ylab(NULL) + scale_y_continuous(labels = function(x) {paste0(x,'%')})
  ggsave('../output/spread_2016.png',height=3,width=5,units = 'in')
  
  # Spreads by year
  conf <- d.in[JumboAtOriginationFlag == 0,j=list(conf=mean(OriginalInterestRate,na.rm=T)),by=c('origination.year')]
  jumb <- d.in[JumboAtOriginationFlag == 1,j=list(jumb=mean(OriginalInterestRate,na.rm=T)),by=c('origination.year')]
  toPlot <- merge(conf,jumb,by='origination.year')
  ggplot(toPlot) + geom_line(aes(x=origination.year,y=100 * (conf - jumb))) + theme_bw() + xlab(NULL) + ylab(NULL) + scale_y_continuous(labels = function(x) {paste0(x,'%')}) +
    scale_x_continuous(breaks = seq(2007,2017,by=2)) + geom_hline(yintercept = 0,linetype = 'dashed')
  ggsave('../output/spread_ts.png',height=3,width=5,units = 'in')
  
  
}


ficoOverTime <- function() {
  d.in <- fread('../data/black_knight_to_analyze.csv')
  byYear <- d.in[,j=list(fico.m = mean(OriginalCreditScore,na.rm=T),
                         fico.p25 = quantile(OriginalCreditScore,probs = .25,na.rm=T),
                         fico.p50 = quantile(OriginalCreditScore,probs = .50,na.rm=T),
                         fico.p75 = quantile(OriginalCreditScore,probs = .75,na.rm=T),
                         ltv.m = mean(OriginalLTV,na.rm=T),
                         ltv.p25 = quantile(OriginalLTV,probs = .25,na.rm=T),
                         ltv.p50 = quantile(OriginalLTV,probs = .50,na.rm=T),
                         ltv.p75 = quantile(OriginalLTV,probs = .75,na.rm=T)),by=c('origination.year','JumboAtOriginationFlag')]
  byYear[,Type := 'Conforming']
  byYear[JumboAtOriginationFlag == 1,Type := 'Jumbo']
  
  ggplot(byYear[JumboAtOriginationFlag == 1]) + geom_line(aes(x=origination.year,y=fico.p50)) + 
    geom_line(aes(x=origination.year,y=fico.p25),linetype = 'dashed') +
    geom_line(aes(x=origination.year,y=fico.p75),linetype = 'dashed') + coord_cartesian(ylim=c(650,800)) + theme_bw() + xlab(NULL) + ylab('FICO') + scale_x_continuous(breaks = seq(2007,2017,2))
  ggsave('../output/fico_jumbo.png',height=3,width=5,units = 'in')
  
  ggplot(byYear[JumboAtOriginationFlag == 0]) + geom_line(aes(x=origination.year,y=fico.p50)) + 
    geom_line(aes(x=origination.year,y=fico.p25),linetype = 'dashed') +
    geom_line(aes(x=origination.year,y=fico.p75),linetype = 'dashed') + coord_cartesian(ylim=c(650,800)) + theme_bw() + xlab(NULL) + ylab('FICO') + scale_x_continuous(breaks = seq(2007,2017,2))
  ggsave('../output/fico_conforming.png',height=3,width=5,units = 'in')
  
  ggplot(byYear[JumboAtOriginationFlag == 1]) + geom_line(aes(x=origination.year,y=ltv.p50)) + 
    geom_line(aes(x=origination.year,y=ltv.p25),linetype = 'dashed') +
    geom_line(aes(x=origination.year,y=ltv.p75),linetype = 'dashed') + coord_cartesian(ylim=c(0.50,0.95)) + theme_bw() + xlab(NULL) + ylab('LTV') + scale_x_continuous(breaks = seq(2007,2017,2))
  ggsave('../output/ltv_jumbo.png',height=3,width=5,units = 'in')
  
  ggplot(byYear[JumboAtOriginationFlag == 0]) + geom_line(aes(x=origination.year,y=ltv.p50)) + 
    geom_line(aes(x=origination.year,y=ltv.p25),linetype = 'dashed') +
    geom_line(aes(x=origination.year,y=ltv.p75),linetype = 'dashed') + coord_cartesian(ylim=c(0.50,0.95)) + theme_bw() + xlab(NULL) + ylab('LTV') + scale_x_continuous(breaks = seq(2007,2017,2))
  ggsave('../output/ltv_conforming.png',height=3,width=5,units = 'in')
  
  
}




rateRegression <- function(data.in) {
  rr <- felm(OriginalInterestRate ~ conf.bucket + OriginalLTV + OriginalCreditScore | OriginalTerm + PurposeOfLoanId + OccupancyId + PropertyTypeId + origination.year , data=data.in)
  #rr <- felm(OriginalInterestRate ~ conf.bucket  , data=data.in)
  
  idx <- grepl(row.names(rr$coefficients),pattern='conf\\.bucket')
  co = rr$coefficients[idx]
  se = rr$se[idx]
  na = as.numeric(gsub(row.names(rr$coefficients)[idx],pattern = 'conf\\.bucket',replace=''))
  toPlot = data.table(b = c(na,1),c = 100*c(co,0),s = 100*c(se,0))
  return(toPlot)
}


createBK <- function() {
  bk.raw <- fread('../../blackknight/Loan.txt')
  
  cols = c('LoanId','State','zip','MortgageTypeId','LoanTypeId','ProductTypeId','InterestTypeId',
           'ClosingMonth','FirstObservedInterestRate','OriginalInterestRate','OriginalTerm',
           'OriginalLoanAmount','OriginalPropertyValue','CLTV','SourceId','PropertyTypeId',
           'BalloonId','BalloonTerm','PurposeOfLoanId','OriginalCreditScore',
           'PrepaymentPenaltylauseId','PAndIFrequencyId','OccupancyId','DTIHousingRatio',
           'Recourse','DocumentationId','MICompanyId','UnderwritingId','NetServiceFee','GuaranteeFee',
           'OriginationMonth','FirstPayMonth','IsJumbo','FirstInterestResetMonth',
           'PIFMonth','PIFId','FirstObservedMonth','IsIO','UpdateDtTm','AgeAtBoarding','TeaserRate',
           'OriginalLTV','JumboAtOriginationFlag','CBSAMetroDivId')  
  
  names(bk.raw) <- cols
  
  # Get date of origination
  # Origination month is months since 12/1979
  # 1 = 1/1980.
  bk.raw[,origination.year := 1980 + floor(OriginationMonth / 12)]
  
  
  # Keep:
  ## Year in 2007:2017
  
  # Keep
  # Year in 2007:2017
  # Fixed rate non-IO, Conventional (Non-FHA, etc.), 
  ColsToKeep <- c('zip','CBSAMetroDivId','origination.year','OriginalInterestRate','OriginalTerm','OriginalLoanAmount','OriginalPropertyValue','PropertyTypeId','PurposeOfLoanId','OriginalCreditScore','OccupancyId','DTIHousingRatio',
                  'DocumentationId','IsJumbo','OriginalLTV','JumboAtOriginationFlag')
  toKeep <- bk.raw[!is.na(zip) & OriginalCreditScore != 0 & origination.year %in% 2007:2017 & LoanTypeId == 1 & ProductTypeId %in% c(3,6) & InterestTypeId == 1 & BalloonId == 0 & MortgageTypeId == 1 & SourceId != 3,ColsToKeep,with=F]
  
  toKeep[,zip3 := substr(zip,1,3)]
  
  # Load on the conforming loan limits
  conf.limits <- fread('../data/crosswalk.csv')
  conf.limits[,zip3 := substr(str_pad(zip,width=5,pad='0',side='left'),1,3)]
  conf.limits <- conf.limits[!is.na(conforming_limit)]
  conf.limits <- conf.limits[!duplicated(paste(zip3,year)),c('zip3','year','conforming_limit'),with=F]
  
  with.limits <- merge(toKeep,conf.limits,by.x=c('origination.year','zip3'),by.y=c('year','zip3'),all.x=T)
  with.limits[is.na(conforming_limit),conforming_limit := 417000]
  with.limits <- with.limits[,!c('zip'),with=F]
  
  write.table(x = with.limits,file = '../data/black_knight_to_analyze.csv',row.names=F,col.names=T)
}
